from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import datetime
import random

# Every external link discovered so far on the crawled site, plus every
# internal link that has already been scheduled for crawling.
# getAllExternalLinks consults and updates both sets so that each link is
# printed / followed only once.
allExtLinks = set()
allIntLinks = set()


def _recordPageLinks(pageUrl):
    """Fetch one page, record its new external links, return its internal links.

    Newly seen external links are added to the module-level ``allExtLinks``
    set and printed. Errors raised while opening or parsing the page are not
    caught here and propagate to the caller.
    """
    # The context manager closes the HTTP response once the parser has
    # consumed it, instead of leaking the connection.
    with urlopen(pageUrl) as response:
        # Name the parser explicitly so results do not depend on which
        # optional parsers happen to be installed (and to avoid bs4's warning).
        bsObj = BeautifulSoup(response, "html.parser")

    # Both link extractors need the same first component of the split address.
    domain = splitAddress(pageUrl)[0]
    internalLinks = getInternalLinks(bsObj, domain)
    externalLinks = getExternalLinks(bsObj, domain)

    for link in externalLinks:
        if link not in allExtLinks:
            allExtLinks.add(link)
            print(link)
    return internalLinks


def getAllExternalLinks(siteUrl):
    """Crawl outward from ``siteUrl`` and collect every external link found.

    The start page is always fetched. Each internal link that is not yet in
    ``allIntLinks`` is added to it, announced on stdout and then crawled in
    turn, depth-first, in the order the links were returned for their page.
    External links are accumulated in ``allExtLinks`` and printed the first
    time they are seen.

    The traversal uses an explicit stack rather than recursion, so the crawl
    depth is not bounded by the interpreter's recursion limit.

    Args:
        siteUrl: URL of the page to start crawling from.

    Returns:
        None. Results are left in ``allExtLinks`` and ``allIntLinks``.
    """
    # Each stack entry is an iterator over one page's internal links; the top
    # of the stack is the page currently being expanded. Resuming a parent's
    # iterator after a child finishes mirrors the recursive visiting order.
    pending = [iter(_recordPageLinks(siteUrl))]
    while pending:
        for link in pending[-1]:
            if link not in allIntLinks:
                print("即将获取链接的URL是："+link)
                allIntLinks.add(link)
                # Descend into the new page before continuing with siblings.
                pending.append(iter(_recordPageLinks(link)))
                break
        else:
            # No unvisited links remain on this page; return to its parent.
            pending.pop()
# Kick off the crawl at the O'Reilly home page. This is a bare module-level
# call, so it runs whenever the file is executed or imported.
getAllExternalLinks("http://oreilly.com")
